# Load the lab 04 video data set into a DataFrame.
data = pd.read_csv('../lab_04/videos_data.csv')

# Strip the commas out of the 'Likes' strings and cast the result to int,
# storing it in a separate column so the raw text is left untouched.
data['Likes_numeric'] = (
    data['Likes']
    .str.replace(',', '')
    .astype(int)
)

# Build the profiling report over the whole frame.
profile = ProfileReport(data, title="Pandas Profiling Report")
profile.to_widgets()

Advanced Computing for Policy
Task:
# Read the lab 04 video data set.
data = pd.read_csv('../lab_04/videos_data.csv')

# 'Likes' is handled as text here: remove the commas, then convert to int
# and keep the converted values in their own column.
data['Likes_numeric'] = (
    data['Likes']
    .str.replace(',', '')
    .astype(int)
)

# Generate the profiling report for the loaded frame.
profile = ProfileReport(data, title="Pandas Profiling Report")
profile.to_widgets()


def check_numeric(data, column):
    """Assert that ``data[column]`` has a numeric dtype.

    Args:
        data: Object indexable by ``column`` whose items expose ``.dtype``
            (used here with a pandas DataFrame).
        column: Label of the column to check.

    Raises:
        AssertionError: If the column's dtype is not numeric. The message is
            ``"<column> is not numeric"``.
    """
    # Function-scope import keeps this helper usable on its own.
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    dtype = data[column].dtype
    # Accept every numeric width and the nullable variants, not only the
    # 64-bit defaults. bool stays rejected, as it was by the original
    # ['int64', 'float64'] whitelist.
    is_numeric = is_numeric_dtype(dtype) and not is_bool_dtype(dtype)
    assert is_numeric, f"{column} is not numeric"
cols = ['Rank', 'Likes', 'Dislikes']
for col in cols:
    check_numeric(data, col)

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[8], line 6
      4 cols = ['Rank', 'Likes', 'Dislikes']
      5 for col in cols:
----> 6     check_numeric(data, col)

Cell In[8], line 2, in check_numeric(data, column)
      1 def check_numeric(data, column):
----> 2     assert data[column].dtype in ['int64', 'float64'], f"{column} is not numeric"

AssertionError: Likes is not numeric
def is_outlier(value, q1, q3, multiplier=1.5):
    """Return whether ``value`` lies outside the IQR fences.

    The fences are ``q1 - multiplier * iqr`` and ``q3 + multiplier * iqr``,
    where ``iqr = q3 - q1``. Values exactly on a fence are not outliers.

    Args:
        value: The scalar to test.
        q1: First quartile of the reference data.
        q3: Third quartile of the reference data.
        multiplier: How many IQRs beyond the quartiles the fences sit.
            Defaults to 1.5.

    Returns:
        True if ``value`` is strictly below the lower fence or strictly
        above the upper fence, otherwise False.
    """
    iqr = q3 - q1  # Interquartile range
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Two explicit comparisons (rather than a negated range check) so that
    # NaN, which compares False to everything, is never flagged.
    return value < lower_bound or value > upper_bound
def column_has_outliers(data, column):
    """Return whether any value in ``data[column]`` is an IQR outlier.

    Uses the same 1.5 x IQR fences as ``is_outlier``, computed from the
    column's own 25th and 75th percentiles.

    Args:
        data: DataFrame holding the column.
        column: Label of the column to inspect.

    Returns:
        True if at least one value is strictly below ``q1 - 1.5 * iqr`` or
        strictly above ``q3 + 1.5 * iqr``, otherwise False.
    """
    values = data[column]
    q1 = values.quantile(0.25)  # First quartile
    q3 = values.quantile(0.75)  # Third quartile
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Compare the whole column at once instead of calling a Python function
    # per element; NaN compares False on both sides, so it is never flagged.
    outside = (values < lower_bound) | (values > upper_bound)
    return bool(outside.any())
assert not column_has_outliers(data, 'Likes_numeric'), "Likes has outliers"

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[9], line 12
      9     q3 = data[column].quantile(0.75) # Third quartile
     10     return any(data[column].apply(lambda x: is_outlier(x, q1, q3)))
---> 12 assert not column_has_outliers(data, 'Likes_numeric'), "Likes has outliers"

AssertionError: Likes has outliers
import numpy as np
import pandas as pd
def simulate_data(n, rng=None):
    """Simulate ``n`` points from the linear model ``y = 2 + 3x + noise``.

    ``x`` is drawn uniformly from [0, 1) and the noise is standard normal.

    Args:
        n: Number of points to draw.
        rng: Optional ``numpy.random.Generator`` to draw from, for
            reproducible output without touching global state. When omitted,
            the global ``np.random`` stream is used, so ``np.random.seed``
            continues to control the result.

    Returns:
        A tuple ``(x, y)`` of two arrays of length ``n``.
    """
    # Both the legacy module and a Generator expose uniform() and normal()
    # with the same (low/loc, high/scale, size) positional arguments.
    source = np.random if rng is None else rng
    # Draw x before the noise so the default path consumes the global
    # stream in the same order as before.
    x = source.uniform(0, 1, n)
    y = 2 + 3 * x + source.normal(0, 1, n)
    return x, y
from matplotlib import pyplot as plt
def plot_data(x, y):
    """Show a scatter plot of ``y`` against ``x`` with labelled axes.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis, paired with ``x``.
    """
    # The previous `width` and `height` locals were assigned but never
    # used, so they have been removed.
    plt.scatter(x, y)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
plot_data(*simulate_data(100))

You’re going to set up your tests and linting to run automatically every time you push code to GitHub.
This is one of those times where you’ll follow instructions without necessarily knowing what’s going on.
# GitHub Actions workflow: on every push, install the project's
# dependencies, run the test suite with coverage, then lint with ruff.
name: Run tests

on: push

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - name: Clone repository
        uses: actions/checkout@v4

      # https://github.com/actions/setup-python
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          # Reuse pip's download cache between runs.
          cache: pip

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run tests
        # https://pytest-cov.readthedocs.io/en/latest/readme.html
        run: pytest --cov

      # https://github.com/astral-sh/ruff-action
      - name: Run ruff
        uses: astral-sh/ruff-action@v3
        with:
          version: latest

# NOTE(review): the text below was fused onto the `version: latest` line;
# presumably it is the path this workflow should be saved at. Confirm.
# .github/workflows/tests.yml.